# Where the files live online ...
remote_files <- glue('{root}/camels_{types}.txt')

# Where we want to download the data ...
local_files <- glue('data/camels_{types}.txt')
# Lets you pick your own colors.
scale_color_manual(values = c("purple", "magenta", "lightpink"))
<ggproto object: Class ScaleDiscrete, Scale, gg>
aesthetics: colour
axis_order: function
break_info: function
break_positions: function
breaks: waiver
call: call
clone: function
dimension: function
drop: TRUE
expand: waiver
get_breaks: function
get_breaks_minor: function
get_labels: function
get_limits: function
get_transformation: function
guide: legend
is_discrete: function
is_empty: function
labels: waiver
limits: NULL
make_sec_title: function
make_title: function
map: function
map_df: function
n.breaks.cache: NULL
na.translate: TRUE
na.value: grey50
name: waiver
palette: function
palette.cache: NULL
position: left
range: environment
rescale: function
reset: function
train: function
train_df: function
transform: function
transform_df: function
super: <ggproto object: Class ScaleDiscrete, Scale, gg>
zero_q_freq: frequency of days with Q = 0 mm/day (reported as a percentage)
Question 2
# Plotting packages used in this section
library(ggplot2)
library(ggthemes)
library(maps)
Attaching package: 'maps'
The following object is masked from 'package:purrr':
map
# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(
    # NOTE: fixed typo "Runnoff" -> "Runoff" in the displayed title
    title = "Aridity vs Rainfall vs Runoff",
    x = "Aridity",
    y = "Rainfall",
    color = "Mean Flow"
  )
`geom_smooth()` using formula = 'y ~ x'
# Same scatter as above, but with log-transformed axes to linearize the
# aridity/rainfall relationship
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(
    # NOTE: fixed typo "Runnoff" -> "Runoff" in the displayed title
    title = "Aridity vs Rainfall vs Runoff",
    x = "Aridity",
    y = "Rainfall",
    color = "Mean Flow"
  )
`geom_smooth()` using formula = 'y ~ x'
# Log-log scatter with the color scale also log-transformed, since q_mean is
# heavily right-skewed
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(
    legend.position = "bottom",
    # Expand the legend width ...
    legend.key.width = unit(2.5, "cm"),
    legend.key.height = unit(.5, "cm")
  ) +
  labs(
    # NOTE: fixed typo "Runnoff" -> "Runoff" in the displayed title
    title = "Aridity vs Rainfall vs Runoff",
    x = "Aridity",
    y = "Rainfall",
    color = "Mean Flow"
  )
`geom_smooth()` using formula = 'y ~ x'
# Make the split reproducible
set.seed(123)

# Bad form to perform simple transformations on the outcome variable within a
# recipe. So, we'll do it here.
camels <- camels |>
  mutate(logQmean = log(q_mean))

# Generate the split: 80% train / 20% test, plus 10-fold CV on the training set
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)
camels_cv    <- vfold_cv(camels_train, v = 10)
# Create a recipe to preprocess the data.
# NOTE: the original chain mixed %>% and |>; standardized on the native pipe.
rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) |>
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) |>
  # Add an interaction term between aridity and p_mean
  step_interact(terms = ~ aridity:p_mean) |>
  # Drop any rows with missing values in the predictors or outcome
  step_naomit(all_predictors(), all_outcomes())
# Prepare the data: estimate the recipe on the training set, then apply it
baked_data <- prep(rec, camels_train) |>
  bake(new_data = NULL)

# Interaction with lm
# Base lm sets interaction terms with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)
Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)
Residuals:
Min 1Q Median 3Q Max
-2.91162 -0.21601 -0.00716 0.21230 2.85706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.77586 0.16365 -10.852 < 2e-16 ***
aridity -0.88397 0.16145 -5.475 6.75e-08 ***
p_mean 1.48438 0.15511 9.570 < 2e-16 ***
aridity:p_mean 0.10484 0.07198 1.457 0.146
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared: 0.7697, Adjusted R-squared: 0.7684
F-statistic: 591.6 on 3 and 531 DF, p-value: < 2.2e-16
# Sanity check: the recipe-built interaction column should give the same fit
# as lm's * operator ... these should be equal!!
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))
Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean,
data = baked_data)
Residuals:
Min 1Q Median 3Q Max
-2.91162 -0.21601 -0.00716 0.21230 2.85706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.77586 0.16365 -10.852 < 2e-16 ***
aridity -0.88397 0.16145 -5.475 6.75e-08 ***
p_mean 1.48438 0.15511 9.570 < 2e-16 ***
aridity_x_p_mean 0.10484 0.07198 1.457 0.146
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared: 0.7697, Adjusted R-squared: 0.7684
F-statistic: 591.6 on 3 and 531 DF, p-value: < 2.2e-16
Warning in bake.step_log(step, new_data = new_data): NaNs produced
# Stack the test-set metrics for all four models into one tibble, tagging each
# row with the model that produced it
model_metrics <- bind_rows(
  metrics(test_data, truth = logQmean, estimate = lm_pred) %>%
    mutate(model = "Linear Regression"),
  metrics(test_data, truth = logQmean, estimate = rf_pred) %>%
    mutate(model = "Random Forest"),
  metrics(test_data, truth = logQmean, estimate = xgb_pred) %>%
    mutate(model = "XGBoost"),
  metrics(test_data, truth = logQmean, estimate = nnet_pred) %>%
    mutate(model = "Neural Network")
)
print(model_metrics)
# A tibble: 12 × 4
.metric .estimator .estimate model
<chr> <chr> <dbl> <chr>
1 rmse standard 0.583 Linear Regression
2 rsq standard 0.742 Linear Regression
3 mae standard 0.390 Linear Regression
4 rmse standard 0.792 Random Forest
5 rsq standard 0.529 Random Forest
6 mae standard 0.590 Random Forest
7 rmse standard 2.16 XGBoost
8 rsq standard 0.454 XGBoost
9 mae standard 2.02 XGBoost
10 rmse standard 7.25 Neural Network
11 rsq standard 0.289 Neural Network
12 mae standard 7.18 Neural Network
Linear regression seems like it would be the best for this!
There were issues with some computations A: x30 B: x1
There were issues with some computations A: x30 B: x20 C: x32
There were issues with some computations A: x30 B: x20 C: x40
✔ 2 of 3 tuning: recipe_XGBoost (4.8s)
i 3 of 3 tuning: recipe_Neural Network
✔ 3 of 3 tuning: recipe_Neural Network (22.6s)
# Gather the resampling metrics across every workflow in the tuned set
collect_metrics(model_fit)
# A tibble: 90 × 9
wflow_id .config preproc model .metric .estimator mean n std_err
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
1 recipe_Random … Prepro… recipe rand… mae standard 0.0841 1 NA
2 recipe_Random … Prepro… recipe rand… rmse standard 0.141 1 NA
3 recipe_Random … Prepro… recipe rand… rsq standard 0.945 1 NA
4 recipe_Random … Prepro… recipe rand… mae standard 0.102 1 NA
5 recipe_Random … Prepro… recipe rand… rmse standard 0.164 1 NA
6 recipe_Random … Prepro… recipe rand… rsq standard 0.928 1 NA
7 recipe_Random … Prepro… recipe rand… mae standard 0.0445 1 NA
8 recipe_Random … Prepro… recipe rand… rmse standard 0.0832 1 NA
9 recipe_Random … Prepro… recipe rand… rsq standard 0.979 1 NA
10 recipe_Random … Prepro… recipe rand… mae standard 0.0585 1 NA
# ℹ 80 more rows
# 2. Get best XGBoost model from tuning results (highest R-squared), then lock
#    its hyperparameters into the model spec
best_xgb <- model_fit %>%
  extract_workflow_set_result("recipe_XGBoost") %>%
  select_best(metric = "rsq")

final_xgb <- finalize_model(xgb_model, best_xgb)
# 3. Build and fit the workflow with the final model and recipe
final_xgb_wf <- workflow() %>%
  add_model(final_xgb) %>%
  add_recipe(rec)

final_xgb_fit <- final_xgb_wf %>%
  fit(data = camels_train)
# 4. Clean test set: keep only rows with a positive, non-missing q_mean
#    (log/ratio comparisons downstream require q_mean > 0)
camels_test_clean <- camels_test %>%
  filter(!is.na(q_mean), q_mean > 0)
# Observed vs predicted streamflow on log-log axes; the dashed red line is the
# 1:1 line (perfect prediction)
ggplot(xgb_preds, aes(x = q_mean_pred, y = q_mean_obs)) +
  geom_point(alpha = 0.7, color = "mediumorchid") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "XGBoost — Observed vs Predicted Streamflow",
    x = "Predicted q_mean",
    y = "Observed q_mean"
  ) +
  theme_minimal()
The XGBoost model worked well on the test data: it predicted values that are very similar to the observed streamflow values. The log–log scatterplot shows a cluster of points around the 1:1 line, which indicates accurate predictions. There is not a lot of spread, even at the low and high extremes, which means the model generalizes well and captures key hydrological patterns. This supports the high R² score shown during cross-validation and reinforces that XGBoost is a good choice for modeling streamflow in this dataset.